Data Visualization 2

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(patchwork)
library(p8105.datasets)
data("weather_df")

Recreating scatterplot from viz1

# Scatterplot of daily maximum vs. minimum temperature, one color per
# weather station; points are semi-transparent to reduce overplotting.
ggplot(weather_df, aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = 0.5)
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

Making labels

# Same scatterplot with a title, axis labels, legend title, and caption
# supplied through labs().
weather_df |> 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package"
  )
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

Adding scales

# Labelled scatterplot with a customised x axis: tick marks are placed at
# -15, 0 and 15, and each tick gets the matching text from `labels`.
weather_df |> 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package") + 
  scale_x_continuous(
    breaks = c(-15, 0, 15), 
    labels = c("-15º C", "0", "15"))
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

Even more scale options

# Labelled scatterplot with both axes customised:
#   * x axis: explicit breaks/labels and limits of [-20, 30];
#   * y axis: square-root transformed and drawn on the right-hand side.
# Non-positive tmax values cannot be sqrt-transformed, so those rows are
# dropped with a warning when the plot is drawn.
weather_df |> 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package") + 
  scale_x_continuous(
    breaks = c(-15, 0, 15), 
    labels = c("-15ºC", "0", "15"),
    limits = c(-20, 30)) + 
  scale_y_continuous(
    # `transform` replaces the `trans` argument deprecated in ggplot2 3.5.0.
    transform = "sqrt", 
    position = "right")
## Warning in transformation$transform(x): NaNs produced
## Warning in scale_y_continuous(trans = "sqrt", position = "right"): sqrt
## transformation introduced infinite values.
## Warning: Removed 142 rows containing missing values or values outside the scale range
## (`geom_point()`).

Analogously to scale_x_* and scale_y_*, there are scales corresponding to other aesthetics. Some of the most common are used to control the color aesthetic. For example, arguments to scale_color_hue() control the color scale and the name in the plot legend.

# Labelled scatterplot whose discrete color scale is limited to the hue
# range 100-300 via scale_color_hue().
weather_df |> 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package") + 
  scale_color_hue(h = c(100, 300))
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

Creating a new color scheme by hand is difficult; it is easier to use a predefined palette such as viridis.

# Base temperature plot, saved so later chunks can add themes to it.
#
# The viridis palette is used for the color scale. `discrete = TRUE` is needed
# because the color aesthetic is mapped to a discrete variable (`name`). In
# other cases (for example, when color is mapped to `prcp`) the argument can be
# omitted to get a continuous color gradient. viridis::scale_fill_viridis() is
# the counterpart for the fill aesthetic used in histograms, density plots,
# and elsewhere.
ggp_temp_plot = 
  weather_df |> 
  ggplot(aes(x = tmin, y = tmax)) + 
  geom_point(aes(color = name), alpha = .5) + 
  labs(
    title = "Temperature plot",
    x = "Minimum daily temperature (C)",
    y = "Maximum daily temperature (C)",
    color = "Location",
    caption = "Data from the rnoaa package"
  ) + 
  viridis::scale_color_viridis(
    name = "Location", 
    discrete = TRUE
  )

# Print the saved plot.
ggp_temp_plot
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Themes

Updating base plot…

# Reuse the saved base plot and move its legend below the panel.
ggp_temp_plot + theme(legend.position = "bottom")
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

Adding data in geoms

# One data frame per station, so each geom below can draw a different one.
central_park_df = filter(weather_df, name == "CentralPark_NY")
molokai_df = filter(weather_df, name == "Molokai_HI")

# Molokai is the plot's default data and is drawn as points; Central Park is
# overlaid as a line by handing its own data frame to geom_line().
molokai_df |> 
  ggplot(aes(x = date, y = tmax, color = name)) + 
  geom_point() + 
  geom_line(data = central_park_df)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

## Patchwork

Sometimes, though, you want to show two or three fundamentally different plots in the same graphic: you may want to juxtapose a scatterplot and a boxplot, or show scatterplots illustrating relationships between different variables. In this case, a solution is to create each of the panels you want separately and combine panels using tools in the patchwork package:

Make three plots and combine using patchwork

# Panel 1: maximum vs. minimum temperature, colored by station.
ggp_tmax_tmin = 
  ggplot(weather_df, aes(x = tmin, y = tmax, color = name)) + 
  geom_point(alpha = 0.5)

# Panel 2: precipitation densities by station, using only rows with prcp > 0.
ggp_prec_density = 
  weather_df |> 
  filter(prcp > 0) |> 
  ggplot(aes(x = prcp, fill = name)) + 
  geom_density(alpha = 0.5)

# Panel 3: maximum temperature over time with a smooth trend per station
# (no confidence band); legend placed underneath.
ggp_temp_season = 
  ggplot(weather_df, aes(x = date, y = tmax, color = name)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(se = FALSE) + 
  theme(legend.position = "bottom")

# patchwork layout: `+` puts panels side by side, `/` stacks rows.
(ggp_tmax_tmin + ggp_prec_density) / ggp_temp_season
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

Data manipulation

Let’s make temperature violin plots.

# Violin plots of tmax by station, with the stations placed in a hand-picked
# order: fct_relevel() sets the factor level order, and ggplot draws the x
# axis in level order.
weather_df |> 
  mutate(
    name = fct_relevel(name, c("Molokai_HI", "CentralPark_NY", "Waterhole_WA"))
  ) |> 
  ggplot(aes(x = name, y = tmax, fill = name)) + 
  # `alpha` was previously misspelt as `apla`, so the transparency setting was
  # ignored with an "unknown parameters" warning.
  geom_violin(alpha = 0.5)
## Warning in geom_violin(apla = 0.5): Ignoring unknown parameters: `apla`
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

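The plot above used `fct_relevel()` to put the locations in a hand-picked order; the plot below uses `fct_reorder()` to order them by `tmax` instead.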

# Violin plots of tmax by station, with the stations ordered by their tmax
# values via fct_reorder().
weather_df |> 
  # tmax contains missing values; `.na_rm = TRUE` drops them explicitly when
  # computing the ordering, which is what fct_reorder() already did by default
  # but with a warning.
  mutate(name = fct_reorder(name, tmax, .na_rm = TRUE)) |> 
  ggplot(aes(x = name, y = tmax, fill = name)) + 
  geom_violin(alpha = 0.5)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `name = fct_reorder(name, tmax)`.
## Caused by warning:
## ! `fct_reorder()` removing 17 missing values.
## ℹ Use `.na_rm = TRUE` to silence this message.
## ℹ Use `.na_rm = FALSE` to preserve NAs.
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

Tidying the data before plotting

# Import the PULSE data and reshape it for plotting.
#
# pivot_longer() (tidyr) converts data from "wide" to "long" format: the
# selected bdi_score_* columns are collapsed into two new columns, one holding
# the original column names (here `visit`, with the "bdi_score_" prefix
# stripped) and one holding the corresponding values (here `bdi`).
#
# fct_inorder() then turns `visit` into a factor whose levels follow the order
# in which the visit labels first appear.
pulse_df = 
  haven::read_sas("data_import_examples/public_pulse_data.sas7bdat") |> 
  janitor::clean_names() |> 
  pivot_longer(
    cols = bdi_score_bl:bdi_score_12m,
    names_to = "visit",
    names_prefix = "bdi_score_",
    values_to = "bdi"
  ) |> 
  mutate(visit = fct_inorder(visit))

# Boxplot of BDI scores, one box per visit.
ggplot(pulse_df, aes(x = visit, y = bdi)) + 
  geom_boxplot()
## Warning: Removed 879 rows containing non-finite outside the scale range
## (`stat_boxplot()`).